import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import plotly.express as px
# Apply seaborn's "whitegrid" theme.
sns.set_theme(style="whitegrid")

# Model formula as noted by the author:
# log10(flow) ~ prop_users_dest + prop_users_orig + query_date + log10(distance) + indicator_vars
# Earlier output file, kept for reference:
# df = pd.read_csv("N:/johnson/linkedin_recruiter/outputs/model_output_2021-02-09.csv")
model_output_path = "~/Nextcloud/linkedin_recruiter/outputs/poisson_model_output_2021-02-15.csv"
fit_df = pd.read_csv(model_output_path)

# Display the rows whose Cook's distance is missing.
missing_cooks = fit_df['cooks_dist'].isna()
fit_df[missing_cooks]
| | country_orig | country_dest | flow | users_orig | users_dest | iso3_orig | pop_orig | maxgdp_orig | maxhdi_orig | iso3_dest | ... | prop_users_dest | bin_maxhdi_orig | bin_maxhdi_dest | bin_maxgdp_orig | bin_maxgdp_dest | resids | sresids | preds | cooks_dist | hat_values |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1524 | Mongolia | Kyrgyzstan | 3 | 159965 | 99486 | mng | 3114000 | 12398.0 | 0.5 | kgz | ... | 0.016072 | Low-middle | Middle | Low-middle | Low-middle | 0.0 | NaN | 3.0 | NaN | 1.0 |
1 rows × 38 columns
# Keep only rows that have a Cook's distance, which makes the later plots easier to build.
fit_df = fit_df.dropna(subset=['cooks_dist'])
# residuals vs. log10(distance)
# log10(distance) is computed as a column of the *sorted* frame. Plotly express lines up
# array-like arguments with the rows of data_frame by position, not by index, so a Series
# taken from the unsorted fit_df would pair each distance with the wrong residual/facet.
dist_df = fit_df.sort_values(by='query_date').assign(
    log10_distance=lambda d: np.log10(d['distance']))
fig = px.scatter(
    dist_df, x='log10_distance', y='resids', color='dest_reg',
    facet_col='query_date', facet_col_wrap=3, hover_data=['country_orig', 'country_dest'],
    labels={'log10_distance': 'log10(distance [km])'})
# reference line at zero residual
fig.add_hline(y=0)
fig.show()
# residuals vs. proportion of linkedin users in destination,
# one facet per query date, coloured by destination region
dest_plot_df = fit_df.sort_values(by='query_date')
fig = px.scatter(
    dest_plot_df,
    x='prop_users_dest',
    y='resids',
    color='dest_reg',
    facet_col='query_date',
    facet_col_wrap=3,
    hover_data=['country_orig', 'country_dest'],
)
# reference line at zero residual
fig.add_hline(y=0)
fig.show()
# residuals vs. proportion of linkedin users in origin,
# one facet per query date, coloured by origin region
orig_plot_df = fit_df.sort_values(by='query_date')
fig = px.scatter(
    orig_plot_df,
    x='prop_users_orig',
    y='resids',
    color='orig_reg',
    facet_col='query_date',
    facet_col_wrap=3,
    hover_data=['country_orig', 'country_dest'],
)
# reference line at zero residual
fig.add_hline(y=0)
fig.show()
# absolute residuals vs. predictions
# abs(resids) is added as a column of the *sorted* frame. Plotly express lines up
# array-like arguments with the rows of data_frame by position, not by index, so a Series
# taken from the unsorted fit_df would pair each residual with the wrong prediction/facet.
pred_df = fit_df.sort_values(by='query_date').assign(
    abs_resids=lambda d: d['resids'].abs())
fig = px.scatter(
    pred_df, x='preds', y='abs_resids', color='orig_reg',
    facet_col='query_date', facet_col_wrap=3, hover_data=['country_orig', 'country_dest'],
    labels={'abs_resids': 'abs(residuals)'})
fig.show()
# Standardized residuals: "number of standard errors away from the regression line".
# They express the size of a residual in standard-deviation units:
#   residual              = obs - preds
#   standardized residual = residual / standard deviation
# abs(std resid) > 3 is sometimes used as a threshold for outlier detection.
# First, the most positive residuals (the model underestimates y).
outlier_cols = ['country_orig', 'country_dest', 'bin_maxgdp_orig', 'bin_maxgdp_dest']
is_outlier = fit_df['sresids'].abs() > 3
(
    fit_df.loc[is_outlier]
    .sort_values(by='sresids', ascending=False)[outlier_cols]
    .drop_duplicates()
    .head(10)
)
| | country_orig | country_dest | bin_maxgdp_orig | bin_maxgdp_dest |
|---|---|---|---|---|
| 17340 | Brazil | Portugal | Middle | Middle-high |
| 18695 | Argentina | Spain | Middle | High |
| 39555 | India | Canada | Low | High |
| 37244 | India | United Arab Emirates | Low | High |
| 23645 | France | Switzerland | Middle-high | High |
| 24852 | Sweden | Denmark | Middle-high | High |
| 25374 | France | Luxembourg | Middle-high | High |
| 33425 | United States | United Kingdom | High | High |
| 33400 | India | United Kingdom | Low | High |
| 39557 | Nigeria | Canada | Low | High |
# And the most negative residuals (the model overestimates y):
# same |standardized residual| > 3 filter, sorted ascending this time.
outlier_cols = ['country_orig', 'country_dest', 'bin_maxgdp_orig', 'bin_maxgdp_dest']
is_outlier = fit_df['sresids'].abs() > 3
(
    fit_df.loc[is_outlier]
    .sort_values(by='sresids')[outlier_cols]
    .drop_duplicates()
    .head(10)
)
| | country_orig | country_dest | bin_maxgdp_orig | bin_maxgdp_dest |
|---|---|---|---|---|
| 35902 | India | China | Low | Middle |
| 18561 | India | Spain | Low | High |
| 39576 | United States | Canada | High | High |
| 33382 | Egypt | United Kingdom | Low-middle | High |
| 33553 | Egypt | United States | Low-middle | High |
| 17262 | India | Portugal | Low | Middle-high |
| 22313 | South Africa | India | Low-middle | Low-middle |
| 23656 | India | Switzerland | Low | High |
| 17998 | India | Italy | Low | High |
| 17980 | Egypt | Italy | Low-middle | High |
# number of predictors (hard-coded)
P = 390
# number of observations
n = fit_df.shape[0]
# Hat-values are commonly used to measure leverage; flag anything above 2(P + 1)/n.
hat_threshold = 2 * (P + 1) / n
print(hat_threshold)
# High-leverage country pairs, largest hat-value first.
high_leverage = fit_df['hat_values'] > hat_threshold
(
    fit_df.loc[high_leverage]
    .sort_values(by='hat_values', ascending=False)[['country_orig', 'country_dest']]
    .drop_duplicates()
    .head(10)
)
0.019285309132161088
| | country_orig | country_dest |
|---|---|---|
| 39335 | Gibraltar | Malta |
| 33690 | Israel | United States |
| 21714 | Iceland | Zimbabwe |
| 10440 | Cambodia | Thailand |
| 7772 | Tuvalu | Fiji |
| 28336 | Namibia | South Africa |
| 16143 | Guinea-Bissau | Cape Verde |
| 10959 | Cook Islands | Samoa |
| 11431 | Cambodia | Vietnam |
| 40349 | Isle of Man | Gibraltar |
# Cook's distance combines standardized residuals and leverage;
# flag anything above 4 / (n - P - 1).
cooks_threshold = 4 / (n - P - 1)
print(cooks_threshold)
# "Influential" country pairs, largest Cook's distance first.
is_influential = fit_df['cooks_dist'] > cooks_threshold
(
    fit_df.loc[is_influential]
    .sort_values(by='cooks_dist', ascending=False)[['country_orig', 'country_dest']]
    .drop_duplicates()
    .head(10)
)
9.960655411126052e-05
| | country_orig | country_dest |
|---|---|---|
| 17340 | Brazil | Portugal |
| 39555 | India | Canada |
| 39577 | United States | Canada |
| 37244 | India | United Arab Emirates |
| 35903 | India | China |
| 39557 | Nigeria | Canada |
| 18695 | Argentina | Spain |
| 23645 | France | Switzerland |
| 33431 | United States | United Kingdom |
| 18561 | India | Spain |
# Influence plot using standardized residuals, Cook's distance, and hat-values.
# The marker size is added as a column of the *sorted* frame. Plotly express lines up
# array-like arguments with the rows of data_frame by position, not by index, so a Series
# taken from the unsorted fit_df would size each marker by another row's Cook's distance.
influence_df = fit_df.sort_values(by='query_date').assign(
    marker_size=lambda d: np.sqrt(d['cooks_dist'] * 10))
fig = px.scatter(
    influence_df, x='hat_values', y='sresids', size='marker_size',
    color='orig_reg', facet_col='query_date', facet_col_wrap=3,
    hover_data=['country_orig', 'country_dest'],
    labels={'hat_values': 'hat values', 'sresids': 'std. residuals', 'marker_size': 'size'})
fig.show()
# Zoom in on a single query date.
on_date = fit_df['query_date'] == '2020-11-19'
sub_df = fit_df[on_date]
# Marker size grows with Cook's distance; it comes from sub_df itself, so rows line up.
marker_size = np.sqrt(sub_df['cooks_dist'] * 10)
fig = px.scatter(
    sub_df,
    x='hat_values',
    y='sresids',
    size=marker_size,
    color='orig_reg',
    hover_data=['country_orig', 'country_dest'],
    labels={'hat_values': 'leverage (hat-values)', 'sresids': 'outliers (standardized residuals)'},
)
# Dashed guides: +/-3 standardized residuals and the hat-value threshold.
for level in (3, -3):
    fig.add_hline(y=level, line_dash='dash')
fig.add_vline(x=hat_threshold, line_dash='dash')
fig.show()
# Influence plot for one query date, coloured by the origin country's maxhdi_orig value.
# Rows are ordered by maxhdi_orig before plotting.
# NOTE(review): the 'Null'-filled bin_maxhdi_orig column is not referenced by this figure
# (color uses maxhdi_orig) -- confirm whether color='bin_maxhdi_orig' was intended.
sub_df = (
    fit_df.query("query_date == '2020-10-08'")
    .sort_values(by='maxhdi_orig')
    .assign(bin_maxhdi_orig=lambda d: d['bin_maxhdi_orig'].fillna('Null'))
)
marker_size = np.sqrt(sub_df['cooks_dist'] * 10)
fig = px.scatter(
    sub_df,
    x='hat_values',
    y='sresids',
    size=marker_size,
    color='maxhdi_orig',
    hover_data=['country_orig', 'country_dest'],
    labels={'hat_values': 'leverage (hat-values)', 'sresids': 'outliers (standardized residuals)'},
)
# Dashed guides: +/-3 standardized residuals and the hat-value threshold.
for level in (3, -3):
    fig.add_hline(y=level, line_dash='dash')
fig.add_vline(x=hat_threshold, line_dash='dash')
fig.show()
# Influence plot for one query date, coloured by the destination country's maxhdi_dest value.
# Rows are ordered by maxhdi_dest before plotting.
# NOTE(review): the 'Null'-filled bin_maxhdi_dest column is not referenced by this figure
# (color uses maxhdi_dest) -- confirm whether color='bin_maxhdi_dest' was intended.
sub_df = (
    fit_df.query("query_date == '2020-10-08'")
    .sort_values(by='maxhdi_dest')
    .assign(bin_maxhdi_dest=lambda d: d['bin_maxhdi_dest'].fillna('Null'))
)
marker_size = np.sqrt(sub_df['cooks_dist'] * 10)
fig = px.scatter(
    sub_df,
    x='hat_values',
    y='sresids',
    size=marker_size,
    color='maxhdi_dest',
    hover_data=['country_orig', 'country_dest'],
    labels={'hat_values': 'leverage (hat-values)', 'sresids': 'outliers (standardized residuals)'},
)
# Dashed guides: +/-3 standardized residuals and the hat-value threshold.
for level in (3, -3):
    fig.add_hline(y=level, line_dash='dash')
fig.add_vline(x=hat_threshold, line_dash='dash')
fig.show()